## āā Attaching packages āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā tidyverse 1.3.1 āā
## ā ggplot2 3.3.5 ā purrr 0.3.4
## ā tibble 3.1.6 ā dplyr 1.0.7
## ā tidyr 1.1.4 ā stringr 1.4.0
## ā readr 2.1.1 ā forcats 0.5.1
## āā Conflicts āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā tidyverse_conflicts() āā
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'textshape'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:purrr':
##
## flatten
## The following object is masked from 'package:tibble':
##
## column_to_rownames
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: timeDate
## Loading required package: timeSeries
##
## Attaching package: 'timeSeries'
## The following object is masked from 'package:psych':
##
## outlier
##
## Attaching package: 'fBasics'
## The following object is masked from 'package:psych':
##
## tr
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:timeSeries':
##
## time<-
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following objects are masked from 'package:timeDate':
##
## kurtosis, skewness
## The following object is masked from 'package:graphics':
##
## legend
df <- read_excel("~/Desktop/Market Index Portfolios/Bases de Datos/Criterios-Unificado.xlsx")
df2 <- subset(df, Country!="China" & Country!="United States")
df2 <- column_to_rownames(df2, loc = 1)
# Data Frame with all variables and without China and USA
df <- column_to_rownames(df, loc = 1)
# Data Frame with all variables and observations
describe(df)
## vars n mean sd median trimmed mad
## GCI 1 122 6.127000e+01 1.250000e+01 6.145000e+01 6.127000e+01 1.32000e+01
## ECI 2 122 1.400000e-01 9.600000e-01 8.000000e-02 1.400000e-01 1.10000e+00
## EDBI 3 122 6.685000e+01 1.217000e+01 6.825000e+01 6.761000e+01 1.42300e+01
## GDP 4 122 6.980746e+11 2.402388e+12 8.917925e+10 2.350047e+11 1.14957e+11
## min max range skew kurtosis se
## GCI 35.10 8.480000e+01 4.970000e+01 0.04 -0.82 1.13000e+00
## ECI -1.91 2.200000e+00 4.120000e+00 0.05 -0.82 9.00000e-02
## EDBI 35.20 8.700000e+01 5.180000e+01 -0.47 -0.56 1.10000e+00
## GDP 2366213069.00 2.143322e+13 2.143086e+13 6.83 51.00 2.17502e+11
df1 <- df[,-4]
# Data Frame without GDP and with USA and China
cor.plot(df)
cor.plot(df1)
cor.plot(df2)
Todas las variables tienen una significante correlación entre si, excepto por el GDP. La correlación entre GDP y las otras variables aumenta cuando eliminamos a los outliers
pca_df<- prcomp(df)
pca_df1<- prcomp(df1)
pca_df2<- prcomp(df2)
summary(pca_df)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 2.402e+12 16.11 4.434 0.4431
## Proportion of Variance 1.000e+00 0.00 0.000 0.0000
## Cumulative Proportion 1.000e+00 1.00 1.000 1.0000
summary(pca_df1)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 16.8653 4.52786 0.44407
## Proportion of Variance 0.9322 0.06719 0.00065
## Cumulative Proportion 0.9322 0.99935 1.00000
summary(pca_df2)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 7.892e+11 15.16 4.268 0.4358
## Proportion of Variance 1.000e+00 0.00 0.000 0.0000
## Cumulative Proportion 1.000e+00 1.00 1.000 1.0000
Importancia del componente principal porcentaje de la varianza que se explica con el PCA1
df_PC1234 <- cbind(df, pca_df$x)
df_PC1234_Descent <- df_PC1234 %>%
arrange(desc(PC1))
df1_PC123 <- cbind(df1, pca_df1$x)
df1_PC123_Descent <- df1_PC123 %>%
arrange(desc(PC1))
df2_PC1234 <- cbind(df2, pca_df2$x)
df2_PC1234_Descent <- df2_PC1234 %>%
arrange(desc(PC1))
fviz_pca_ind(pca_df,
repel = TRUE,
title = "Place of each country in a PC1 and PC2 Map [With GDP]")
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#### 3.1.2. Mapa Cartesiano con PC1 y PC2 como ejes para visualizar la posición de los paises (con GDP).
fviz_pca_ind(pca_df1,
repel = TRUE,
title = "Place of each country in a PC1 and PC2 Map [Without GDP]")
## Warning: ggrepel: 47 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#### 3.1.3. Mapa Cartesiano con PC1 y PC2 como ejes para visualizar la posición de los paises (con GDP).
fviz_pca_ind(pca_df2,
repel = TRUE,
title = "Place of each country [With GDP and without USA & China]")
## Warning: ggrepel: 95 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
### 3.2. GrƔfica de individuos y variables.
fviz_pca_biplot(pca_df)
fviz_pca_biplot(pca_df1)
fviz_pca_biplot(pca_df2)
### 3.3. Contribución de varianza de variables y de Componentes Principales
fviz_contrib(pca_df, choice = "var",
title = "Percentage of Variance contribution With Variable GDP",
addlabels = TRUE)
fviz_contrib(pca_df1, choice = "var",
title = "Percentage of Variance contribution Without Variable GDP",
addlabels = TRUE)
fviz_contrib(pca_df2, choice = "var",
title = "With Variable GDP and without USA & China",
addlabels = TRUE)
### 3.4. Porcentaje de varianza explicada por cada Componente Principal
fviz_screeplot(pca_df,
title = "4 Principal Components With USA & China",
addlabels = TRUE)
fviz_screeplot(pca_df1,
title = "3 Principal Components Without GDP",
addlabels = TRUE) # Porcentaje de la varianza explicada con el PCA1
fviz_screeplot(pca_df2,
title = "4 Principal Components Without USA & China",
addlabels = TRUE)
### 3.5. Eigen value
fviz_eig(pca_df, choice = "eigenvalue",
addlabels = TRUE,
title = "e")
fviz_eig(pca_df1, choice = "eigenvalue",
addlabels = TRUE,
title = "e")
fviz_eig(pca_df2, choice = "eigenvalue",
addlabels = TRUE,
title = "e")
### 3.6. Matriz de correlaciones con los Componentes Principales
cor.plot(df_PC1234)
cor.plot(df1_PC123)
cor.plot(df2_PC1234)
A partir de los resultados del PCA concluimos que debemos de excluir la variable GDP y mantener a Estados Unido y China. Ahora se harĆ” un agrupamiento de la Data Frame 1 para encontrar los grupos de paises.
Crear grupos de 5 paises con base en los 3 Componentes Principales de la Data Frame 1
map <- fviz_pca_ind(pca_df1)
map
Kdf1_Descent = subset(df1_PC123_Descent, select = c("PC1","PC2", "PC3"))
head(Kdf1_Descent, 5)
## PC1 PC2 PC3
## Singapore 30.12732 2.7728809 -0.04592890
## Hong Kong SAR 28.38455 2.0727765 0.59122781
## United States 27.79992 3.5847563 0.05222266
## Denmark 27.09359 0.6828848 0.43650638
## United Kindom 26.00105 1.8451193 0.02403078
df1_scaled <- scale(df1_PC123_Descent)
set.seed(123)
Elbow Method
fviz_nbclust(df1_scaled,
kmeans,
method = "wss",
k.max = 24)
fviz_nbclust(df1_scaled,
kmeans,
method = "gap_stat",
k.max = 30)
Gap Method
#calculate gap statistic based on number of clusters
gap_stat <- clusGap(df1_scaled,
FUN = kmeans,
nstart = 25,
K.max = 30,
B = 50)
#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)
Agrupacion por clusters
km <- kmeans(Kdf1_Descent, centers = 4, iter.max = 20, nstart = 20)
fviz_cluster(km, data = Kdf1_Descent)#Grafica K means
Agrupación de paises en listas de 5
library(cluster)
library(factoextra) #Para graficar K-Means y PCA
library(psych)
library(stats) #Para hacer el PCA
library(naniar) #Para limpiar las bases de datos
library(fBasics) #Analisis estadistico
library(aTSA) #Raiz Unitaria
##
## Attaching package: 'aTSA'
## The following object is masked from 'package:graphics':
##
## identify
library(tseries) #Raiz Unitaria
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'tseries'
## The following objects are masked from 'package:aTSA':
##
## adf.test, kpss.test, pp.test
library(PerformanceAnalytics)
library(QuantPsyc) #Pruba multivariada
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
library(statmod)
library(ghyp)
## Loading required package: numDeriv
##
## Attaching package: 'ghyp'
## The following object is masked from 'package:caret':
##
## sensitivity
library(cramer) #Para la prueba de cramer